林嶔 (Lin, Chin)
Lesson 5
– 請至這裡下載本週的範例資料
dat = read.csv("Example data.csv", header = TRUE)
head(dat)
## eGFR Disease Survival.time Death Diabetes Cancer SBP DBP
## 1 34.65379 1 0.4771037 0 0 1 121.2353 121.3079
## 2 37.21183 1 3.0704424 0 1 1 122.2000 122.6283
## 3 32.60074 1 0.2607117 1 0 0 118.9136 121.7621
## 4 29.68481 1 NA NA 0 0 118.2212 112.7043
## 5 28.35726 0 0.1681673 1 0 0 116.7469 115.7705
## 6 33.95012 1 1.2238556 0 0 0 119.9936 116.3872
## Education Income
## 1 2 0
## 2 2 0
## 3 0 0
## 4 1 0
## 5 0 0
## 6 1 0
mean(dat$eGFR, na.rm = TRUE)
## [1] 34.14115
sd(dat$eGFR, na.rm = TRUE)
## [1] 7.062506
var(dat$eGFR, na.rm = TRUE)
## [1] 49.87899
median(dat$eGFR, na.rm = TRUE)
## [1] 34.33123
quantile(dat$eGFR, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 10.00000 29.58404 34.33123 38.44965 60.00000
quantile(dat$eGFR, 0.5, na.rm = TRUE)
## 50%
## 34.33123
quantile(dat$eGFR, 0.95, na.rm = TRUE)
## 95%
## 45.86468
min(dat$eGFR, na.rm = TRUE)
## [1] 10
max(dat$eGFR, na.rm = TRUE)
## [1] 60
table(dat$Cancer)
##
## 0 1
## 576 402
table(dat$Cancer, dat$Diabetes)
##
## 0 1
## 0 468 97
## 1 309 81
tab1 = table(dat$Cancer)
prop.table(tab1)
##
## 0 1
## 0.5889571 0.4110429
tab2 = table(dat$Cancer, dat$Diabetes)
prop.table(tab2)
##
## 0 1
## 0 0.49005236 0.10157068
## 1 0.32356021 0.08481675
prop.table(tab2, 1)
##
## 0 1
## 0 0.8283186 0.1716814
## 1 0.7923077 0.2076923
prop.table(tab2, 2)
##
## 0 1
## 0 0.6023166 0.5449438
## 1 0.3976834 0.4550562
mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
## [1] 34.41452
mean(dat[dat[,"Disease"] == 0,]$eGFR, na.rm = TRUE)
## [1] 33.19008
paste(mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), "±", sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), sep = "")
## [1] "34.4145222034604±7.1582590897065"
paste0(mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE), "±", sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE))
## [1] "34.4145222034604±7.1582590897065"
m = mean(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
s = sd(dat[dat[,"Disease"] == 1,]$eGFR, na.rm = TRUE)
formatC(m, digits = 3, format = "f")
## [1] "34.415"
formatC(s, digits = 3, format = "f")
## [1] "7.158"
paste0(formatC(m, digits = 3, format = "f"), "±", formatC(s, digits = 3, format = "f"))
## [1] "34.415±7.158"
函數之input必須為一類別變項及一連續變項
函數中必須設定一個變項,讓使用者能決定顯示小數點的位數
3-1. 第一個函數必須輸出在該類別變項為不同類別時,該連續變項之『平均數±標準差』
3-2. 第二個函數必須輸出在該類別變項為不同類別時,該連續變項之『中位數(25百分位-75百分位)』
– 注意:請考慮類別數不只是2的情形,以及類別項目並非為0、1、2、…的狀況
– 剩下的其實就是把前面的東西貼起來:
my_func.1 = function (x, y, dig = 3) {
lvl.x = levels(factor(x))
n.lvl.x = length(lvl.x)
result = character(n.lvl.x)
for (i in 1:n.lvl.x) {
m = mean(y[x == lvl.x[i]], na.rm = TRUE)
s = sd(y[x == lvl.x[i]], na.rm = TRUE)
result[i] = paste0(formatC(m, digits = dig, format = "f"), "±", formatC(s, digits = dig, format = "f"))
}
result
}
my_func.2 = function (x, y, dig = 3) {
lvl.x = levels(factor(x))
n.lvl.x = length(lvl.x)
result = character(n.lvl.x)
for (i in 1:n.lvl.x) {
m = median(y[x == lvl.x[i]], na.rm = TRUE)
q1 = quantile(y[x == lvl.x[i]], 0.25, na.rm = TRUE)
q2 = quantile(y[x == lvl.x[i]], 0.75, na.rm = TRUE)
result[i] = paste0(formatC(m, digits = dig, format = "f"), "(", formatC(q1, digits = dig, format = "f"), "-", formatC(q2, digits = dig, format = "f"), ")")
}
result
}
my_func.1(dat$Education, dat$SBP)
## [1] "121.365±10.568" "121.406±10.920" "122.853±10.532"
my_func.2(dat$Income, dat$eGFR, dig = 1)
## [1] "34.3(29.5-38.2)" "33.8(28.9-38.8)" "34.9(30.4-40.0)"
– 列表(List)層分為列表(list)、S3物件(S3 class)及S4物件(S4 class):
列表(list):在R裡面,向量的上層是陣列層物件。若是我們希望在一個物件內放置很多陣列層物件,我們會用到列表。値得一提的是,列表裡面可以同時包含數個陣列層物件及變數層物件。
S3物件(S3 class):S3物件是一種特殊的列表物件,他的變化會在後面慢慢介紹。
S4物件(S4 class):S4物件與前面兩種有非常大的不同,相關的函數也不一樣,在本節課我們不會教到。
# 先產生一個數値矩陣物件
x1 = 1:20
M1 = matrix(x1, nrow = 4, ncol = 5)
M1
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 5 9 13 17
## [2,] 2 6 10 14 18
## [3,] 3 7 11 15 19
## [4,] 4 8 12 16 20
# 再產生一個文字矩陣物件
x2 = c("A", "B", "C", "A", "C", "B", "B", "B", "A")
M2 = matrix(x2, nrow = 3, ncol = 3)
M2
## [,1] [,2] [,3]
## [1,] "A" "A" "B"
## [2,] "B" "C" "B"
## [3,] "C" "B" "A"
# 再產生一個邏輯向量
x3 = c(TRUE, FALSE, TRUE, FALSE)
x3
## [1] TRUE FALSE TRUE FALSE
# 將上述這些物件打包成一個列表物件
L1 = list(M1, M2, x3)
L1
## [[1]]
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 5 9 13 17
## [2,] 2 6 10 14 18
## [3,] 3 7 11 15 19
## [4,] 4 8 12 16 20
##
## [[2]]
## [,1] [,2] [,3]
## [1,] "A" "A" "B"
## [2,] "B" "C" "B"
## [3,] "C" "B" "A"
##
## [[3]]
## [1] TRUE FALSE TRUE FALSE
函數「length()」可以協助我們了解物件長度
函數「class()」可以查詢該物件的屬性
函數「names()」可以協助我們命名物件
函數「ls()」可以協助我們看看物件中有哪些東西
length(L1)
## [1] 3
class(L1)
## [1] "list"
names(L1) = c("A", "B", "C")
L1
## $A
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 5 9 13 17
## [2,] 2 6 10 14 18
## [3,] 3 7 11 15 19
## [4,] 4 8 12 16 20
##
## $B
## [,1] [,2] [,3]
## [1,] "A" "A" "B"
## [2,] "B" "C" "B"
## [3,] "C" "B" "A"
##
## $C
## [1] TRUE FALSE TRUE FALSE
ls(L1)
## [1] "A" "B" "C"
L1[[2]]
## [,1] [,2] [,3]
## [1,] "A" "A" "B"
## [2,] "B" "C" "B"
## [3,] "C" "B" "A"
L1[["B"]]
## [,1] [,2] [,3]
## [1,] "A" "A" "B"
## [2,] "B" "C" "B"
## [3,] "C" "B" "A"
L1$B
## [,1] [,2] [,3]
## [1,] "A" "A" "B"
## [2,] "B" "C" "B"
## [3,] "C" "B" "A"
L1[[2]][2,3]
## [1] "B"
L1[["B"]][3,1]
## [1] "C"
L1$B[1,2]
## [1] "A"
經過了上述的示範後,我們了解到列表(list)是一個很方便的物件,它可以把很多很雜的東西丟在同個物件內。但東西多了以後會遇到問題,那就是該列表物件會變的非常非常大,但也許我們想要呈現的東西是很有限的,在R裡面,列表有一種擴展型態叫做S3物件(S3 class),它可以解決這個問題。
S3物件(S3 class)的產生方式如下
#先看看L1的樣子
L1
## $A
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 5 9 13 17
## [2,] 2 6 10 14 18
## [3,] 3 7 11 15 19
## [4,] 4 8 12 16 20
##
## $B
## [,1] [,2] [,3]
## [1,] "A" "A" "B"
## [2,] "B" "C" "B"
## [3,] "C" "B" "A"
##
## $C
## [1] TRUE FALSE TRUE FALSE
#先看看L1的物件屬性
class(L1)
## [1] "list"
#強迫L1成為別的物件屬性
class(L1) = "test"
#再看看L1的物件屬性
class(L1)
## [1] "test"
#看看L1現在的樣子
L1
## $A
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 5 9 13 17
## [2,] 2 6 10 14 18
## [3,] 3 7 11 15 19
## [4,] 4 8 12 16 20
##
## $B
## [,1] [,2] [,3]
## [1,] "A" "A" "B"
## [2,] "B" "C" "B"
## [3,] "C" "B" "A"
##
## $C
## [1] TRUE FALSE TRUE FALSE
##
## attr(,"class")
## [1] "test"
– 小提示:當你使用函數「class()」可以查詢該物件的屬性,若非常見的幾種屬性名稱,那就非常有可能是S3物件(S3 class)或S4物件(S4 class)
#先寫一個自訂函數「print.test()」
print.test = function(test) {
cat("此列表共有",length(test),"個物件\n")
cat("物件名稱分別為:\n")
cat(paste(names(test), collapse = ", "), "\n")
}
#再看看請R列印出L1會變什麼
L1
## 此列表共有 3 個物件
## 物件名稱分別為:
## A, B, C
– 列表(list)的幾個常見函數還是能夠使用:
ls(L1)
## [1] "A" "B" "C"
length(L1)
## [1] 3
class(L1)
## [1] "test"
names(L1) = c("D", "E", "F")
L1
## 此列表共有 3 個物件
## 物件名稱分別為:
## D, E, F
– 在寫之前我們先看看直接對L1使用函數「summary()」會怎樣
summary(L1)
## Length Class Mode
## D 20 -none- numeric
## E 9 -none- character
## F 4 -none- logical
– 現在我們可以讓函數「summary()」使用後產生不同的結果
#先寫一個自訂函數「summary.test()」
summary.test = function(test) {
cat("此列表共有",length(test),"個物件\n")
cat("物件名稱分別為:\n")
cat(paste(names(test), collapse = ", "), "\n")
for (i in 1:length(test)) {
cat(names(test)[i], "之物件屬性為", class(test[[i]]), "\n")
}
}
#再看看使用函數「summary()」後會變什麼
summary(L1)
## 此列表共有 3 個物件
## 物件名稱分別為:
## D, E, F
## D 之物件屬性為 matrix
## E 之物件屬性為 matrix
## F 之物件屬性為 logical
我們已經學會如何將想要的資訊放在列表(list)物件中,並透過將這個物件轉換為一個特定的S3物件(S3 class) 後,就可以透過自訂函數「print.XXX()」呈現想要的結果。
現在請延續練習-1的功課,我們要額外增加一些要求:
平均數必須儲存在列表(list)物件的第一個位置,並且必須儲存為一『數値變數』,長度不限
標準差必須儲存在列表(list)物件的第二個位置,並且必須儲存為一『數値變數』,長度不限
各組別樣本數必須儲存在列表(list)物件的第三個位置,並且必須儲存為一『數値變數』,長度不限
列表(list)物件的第四個位置必須放置『平均數±標準差』,並且必須儲存為一『文字變數』,長度不限
列表(list)物件的第五個位置必須放置當初輸入的類別變項之『各類別名稱』,並且必須儲存為一『文字變數』,長度不限
在使用函數「print()」時,我想看到連續幾行的『組別:‘XXX’之平均數±標準差為’XXX±XXX’(個案數=XXX)』。
my_func.S3 = function (x, y, dig = 3) {
lvl.x = levels(factor(x))
n.lvl.x = length(lvl.x)
my_list = list()
my_list[[1]] = numeric(n.lvl.x)
my_list[[2]] = numeric(n.lvl.x)
my_list[[3]] = numeric(n.lvl.x)
my_list[[4]] = character(n.lvl.x)
my_list[[5]] = character(n.lvl.x)
names(my_list) = c("mean", "sd", "n", "description", "name")
for (i in 1:n.lvl.x) {
current_vec = y[x == lvl.x[i]]
my_list[[1]][i] = mean(current_vec, na.rm = TRUE)
my_list[[2]][i] = sd(current_vec, na.rm = TRUE)
my_list[[3]][i] = sum(table(current_vec))
my_list[[4]][i] = paste0(formatC(my_list[[1]][i], digits = 3, format = "f"), "±", formatC(my_list[[2]][i], digits = 3, format = "f"))
my_list[[5]][i] = lvl.x[i]
}
class(my_list) = 'myS3'
my_list
}
print.myS3 = function (x) {
n.lvl.x = length(x[["name"]])
for (i in 1:n.lvl.x) {
cat(paste0("組別:", x[["name"]][i], "之平均數±標準差為", x[["description"]][i], "(個案數=", x[["n"]][i], ")\n"))
}
}
my_list = my_func.S3(dat$Education, dat$SBP)
ls(my_list)
## [1] "description" "mean" "n" "name" "sd"
my_list
## 組別:0之平均數±標準差為121.365±10.568(個案數=488)
## 組別:1之平均數±標準差為121.406±10.920(個案數=284)
## 組別:2之平均數±標準差為122.853±10.532(個案數=184)
– 這時候我們需要使用函數「t.test()」及「wilcox.test()」
result1 = t.test(dat[,"eGFR"]~dat[,"Diabetes"], var.equal = FALSE)
result1
##
## Welch Two Sample t-test
##
## data: dat[, "eGFR"] by dat[, "Diabetes"]
## t = -12.506, df = 254.94, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7.967643 -5.799633
## sample estimates:
## mean in group 0 mean in group 1
## 32.87223 39.75587
result2 = wilcox.test(dat[,"eGFR"]~dat[,"Diabetes"])
result2
##
## Wilcoxon rank sum test with continuity correction
##
## data: dat[, "eGFR"] by dat[, "Diabetes"]
## W = 29815, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
class(result1)
## [1] "htest"
class(result2)
## [1] "htest"
ls(result1)
## [1] "alternative" "conf.int" "data.name" "estimate" "method"
## [6] "null.value" "parameter" "p.value" "statistic"
ls(result2)
## [1] "alternative" "data.name" "method" "null.value" "parameter"
## [6] "p.value" "statistic"
var.result = var.test(dat[,"eGFR"]~dat[,"Diabetes"])
var.result
##
## F test to compare two variances
##
## data: dat[, "eGFR"] by dat[, "Diabetes"]
## F = 1.0183, num df = 767, denom df = 171, p-value = 0.8997
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.7979204 1.2767309
## sample estimates:
## ratio of variances
## 1.018323
result3 = t.test(dat[,"eGFR"]~dat[,"Diabetes"], var.equal = TRUE)
result3
##
## Two Sample t-test
##
## data: dat[, "eGFR"] by dat[, "Diabetes"]
## t = -12.434, df = 938, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7.970133 -5.797143
## sample estimates:
## mean in group 0 mean in group 1
## 32.87223 39.75587
函數之input必須為一『類別變項』及一『連續變項』
先做一個檢查,如果該『類別變項』之類別數不等於2,則不執行函數或產生警告
先檢查兩組個案數是否皆>25,如果是就做t test,否則就做Wilcoxon test。
如果選擇做做t test,先確定變異數是否同質,並依據變異數同值性檢定的結果,決定要使用哪一種t test。
my_func.analysis = function (x, y) {
lvl.x = levels(factor(x))
n.lvl.x = length(lvl.x)
if (n.lvl.x != 2) {
cat("『類別變項』之類別數不等於2\n")
} else {
n.each = numeric(n.lvl.x)
for (i in 1:n.lvl.x) {
current_vec = y[x == lvl.x[i]]
n.each[i] = sum(table(current_vec))
}
if (FALSE %in% (n.each > 25)) {
result = wilcox.test(y~x)
} else {
var.result = var.test(y~x)
if (var.result[['p.value']] < 0.05) {
result = t.test(y~x, var.equal = FALSE)
} else {
result = t.test(y~x, var.equal = TRUE)
}
}
result
}
}
my_func.analysis(dat$Income, dat$eGFR)
## 『類別變項』之類別數不等於2
my_func.analysis(dat$Disease, dat$eGFR)
##
## Welch Two Sample t-test
##
## data: y by x
## t = -2.4556, df = 466.06, p-value = 0.01443
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.2043011 -0.2445876
## sample estimates:
## mean in group 0 mean in group 1
## 33.19008 34.41452
my_func.analysis(dat[dat$Income == 2,]$Disease, dat[dat$Income == 2,]$eGFR)
##
## Wilcoxon rank sum test with continuity correction
##
## data: y by x
## W = 691, p-value = 0.2357
## alternative hypothesis: true location shift is not equal to 0
我們需要使用函數「aov()」及「anova()」執行ANOVA
函數「kruskal.test()」可以幫助我們做Kruskal-Wallis test
Variance.table = aov(dat[,"eGFR"]~as.factor(dat[,"Education"]))
Variance.table
## Call:
## aov(formula = dat[, "eGFR"] ~ as.factor(dat[, "Education"]))
##
## Terms:
## as.factor(dat[, "Education"]) Residuals
## Sum of Squares 213.93 47107.75
## Deg. of Freedom 2 946
##
## Residual standard error: 7.056683
## Estimated effects may be unbalanced
## 51 observations deleted due to missingness
ANOVA.table = anova(Variance.table)
ANOVA.table
## Analysis of Variance Table
##
## Response: dat[, "eGFR"]
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(dat[, "Education"]) 2 214 106.966 2.148 0.1173
## Residuals 946 47108 49.797
KW.result = kruskal.test(dat[,"eGFR"]~dat[,"Education"])
KW.result
##
## Kruskal-Wallis rank sum test
##
## data: dat[, "eGFR"] by dat[, "Education"]
## Kruskal-Wallis chi-squared = 4.519, df = 2, p-value = 0.1044
先檢查類別數是否>=3,若是則使用ANOVA或Kruskal-Wallis test,否則則使用t test或Wilcoxon test。
檢查各組內個案數是否皆>25,如果是就做ANOVA,否則就做Kruskal-Wallis test。
my_func.analysis = function (x, y) {
lvl.x = levels(factor(x))
n.lvl.x = length(lvl.x)
n.each = numeric(n.lvl.x)
for (i in 1:n.lvl.x) {
current_vec = y[x == lvl.x[i]]
n.each[i] = sum(table(current_vec))
}
if (n.lvl.x > 2) {
if (FALSE %in% (n.each > 25)) {
result = kruskal.test(y~x)
} else {
Variance.table = aov(y~as.factor(x))
result = anova(Variance.table)
}
} else {
if (FALSE %in% (n.each > 25)) {
result = wilcox.test(y~x)
} else {
var.result = var.test(y~x)
if (var.result[['p.value']] < 0.05) {
result = t.test(y~x, var.equal = FALSE)
} else {
result = t.test(y~x, var.equal = TRUE)
}
}
}
result
}
my_func.analysis(dat$Income, dat$eGFR)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(x) 2 102 51.016 1.0115 0.3641
## Residuals 944 47613 50.438
my_func.analysis(dat[dat$Income == 2,]$Education, dat[dat$Income == 2,]$eGFR)
##
## Kruskal-Wallis rank sum test
##
## data: y by x
## Kruskal-Wallis chi-squared = 8.7301, df = 2, p-value = 0.01271
– 以下範例為自變項為糖尿病,依變項為eGFR。
## Variable Group:0 Group:1 p-value
## 1 "eGFR" "32.9±6.6" "39.8±6.5" "<0.001"
.ROUND.p = function(X, p.digits=3) {
p=rep(NA,length(X))
for (a in 1:length(X)) {
if (!is.na(X[a])) {
if (p.digits==0) {p[a]="<1"} else {
p[a]=as.character(round(X[a],p.digits))
if (p[a]=="0") {
if (p.digits==1) {p[a]="<0.1"} else {
p[a]="<0."
for (l in 1:(p.digits-1)) {
p[a]=paste0(p[a],0)
}
p[a]=paste0(p[a],1)
}
} else {
p[a]=formatC(as.numeric(X[a]), format="f", digits = p.digits)
}
}
}
}
return(p)
}
.ROUND.p(0.000518, 2)
## [1] "<0.01"
.ROUND.p(0.000518, 3)
## [1] "0.001"
.ROUND.p(0.000518, 4)
## [1] "0.0005"
本次課程中同學學習到最重要的是列表層物件的概述,其中最重要的應用為S3物件。
除此之外,開始會一些基本的統計檢定函數。
結合之前的課程,我們將可以自動做出符合我們預先設定格式的結果。